import os
from pandas.plotting import autocorrelation_plot
import warnings
import pandas as pd
import seaborn as sns
import requests
import plotly.express as px
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import plotly.graph_objects as go
from IPython.display import display
from scipy.stats import ks_2samp
# from TSPackages import *
from scipy.stats import jarque_bera
from scipy.stats import ks_2samp
from scipy.stats import kurtosis, skew
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
warnings.filterwarnings('ignore')
from pandas.plotting import autocorrelation_plot
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
import matplotlib.pyplot as plt
import os3 ETL
# Folder that holds the yearly INMET CSV exports.
_INMET_DIR = r"C:\Users\Keyla Alba\OneDrive - Universidad del Norte\Doctorado (Ciencias)\Move La America\EDA\Datos"
# (year, last "day-month" covered by the file); the 2025 file stops on 28 February.
_INMET_PERIODS = [(year, "31-12") for year in range(2019, 2025)] + [(2025, "28-02")]
# Full paths, one per file, in chronological order.
file_path = [
    rf"{_INMET_DIR}\INMET_CO_MS_A710_PARANAIBA_01-01-{year}_A_{end}-{year}.CSV"
    for year, end in _INMET_PERIODS
]
def load_and_clean(file_path):
    """Load one INMET hourly CSV and return timestamp plus global radiation.

    The file is read with ';' as separator and latin1 encoding, skipping the
    first 8 lines. Column names are stripped and upper-cased, and the first
    column whose name contains both "RADIACAO" and "KJ" is renamed to
    ``RADIACAO_GLOBAL``.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        DataFrame with two columns: ``datetime`` (built from ``DATA`` and
        ``HORA UTC``; unparseable values become NaT) and ``RADIACAO_GLOBAL``
        (float; missing or unparseable readings become 0).

    Raises:
        KeyError: If no radiation column can be identified, or if the
            ``DATA`` / ``HORA UTC`` columns are absent.
    """
    df = pd.read_csv(file_path, sep=';', encoding='latin1', skiprows=8)
    df.columns = [col.strip().upper() for col in df.columns]

    # Rename only the first matching column, as the original loop did.
    rad_col = next(
        (col for col in df.columns if "RADIACAO" in col and "KJ" in col),
        None,
    )
    if rad_col is not None:
        df = df.rename(columns={rad_col: "RADIACAO_GLOBAL"})
    elif "RADIACAO_GLOBAL" not in df.columns:
        raise KeyError(f"No global-radiation (KJ) column found in {file_path!r}")

    # "1300 UTC" -> "1300" -> "13:00"
    hora = df['HORA UTC'].astype(str).str.replace(' UTC', '', regex=False).str.zfill(4)
    df['HORA UTC'] = hora.str[:2] + ':' + hora.str[2:4]
    df['datetime'] = pd.to_datetime(
        df['DATA'] + ' ' + df['HORA UTC'],
        format='%Y/%m/%d %H:%M',
        errors='coerce',
    )

    # Readings written with a decimal comma ("2434,5") would otherwise be
    # coerced to NaN and then silently turned into 0 by fillna.
    radiacion = (
        df['RADIACAO_GLOBAL']
        .astype(str)
        .str.strip()
        .str.replace(',', '.', regex=False)
    )
    df['RADIACAO_GLOBAL'] = pd.to_numeric(radiacion, errors='coerce').fillna(0)
    return df[['datetime', 'RADIACAO_GLOBAL']]
# Load every file listed in file_path and stack the results into one frame
# with a fresh 0..n-1 index.
df_INMET = pd.concat([load_and_clean(fp) for fp in file_path], ignore_index=True)
# Report (rows, columns) of the combined frame.
print(df_INMET.shape)
df_INMET.head(n=14)(54024, 2)
| datetime | RADIACAO_GLOBAL | |
|---|---|---|
| 0 | 2019-01-01 00:00:00 | 0.0 |
| 1 | 2019-01-01 01:00:00 | 0.0 |
| 2 | 2019-01-01 02:00:00 | 0.0 |
| 3 | 2019-01-01 03:00:00 | 0.0 |
| 4 | 2019-01-01 04:00:00 | 0.0 |
| 5 | 2019-01-01 05:00:00 | 0.0 |
| 6 | 2019-01-01 06:00:00 | 0.0 |
| 7 | 2019-01-01 07:00:00 | 0.0 |
| 8 | 2019-01-01 08:00:00 | 0.0 |
| 9 | 2019-01-01 09:00:00 | 0.0 |
| 10 | 2019-01-01 10:00:00 | 0.0 |
| 11 | 2019-01-01 11:00:00 | 0.0 |
| 12 | 2019-01-01 12:00:00 | 0.0 |
| 13 | 2019-01-01 13:00:00 | 2434.0 |
# Interactive line chart of the raw hourly radiation column.
_etiquetas_ejes = {
    'datetime': 'Fecha y Hora',
    'RADIACAO_GLOBAL': 'Radiación (KJ/m²)',
}
_diseno = {'template': 'plotly_white', 'width': 1100, 'height': 500}

fig_INMET = px.line(
    data_frame=df_INMET,
    x='datetime',
    y='RADIACAO_GLOBAL',
    labels=_etiquetas_ejes,
    title='Radiación Global (KJ/m²) por Hora INMET (2019 a Febrero 2025)',
)
fig_INMET.update_layout(**_diseno)
fig_INMET.show()3.0.0.1 Imputación usando promedio estacional
Uno de los retos fundamentales en el análisis de series temporales de radiación solar es la presencia de valores faltantes, especialmente en estaciones meteorológicas con limitaciones técnicas o períodos de inestabilidad climática. Una alternativa eficaz para abordar este problema es la Regla de Imputación Estacional, la cual consiste en reemplazar los datos perdidos utilizando el promedio de los valores registrados en la misma hora y día de otros años o meses, respetando así la estacionalidad y los patrones cíclicos propios de la variable. Esta técnica resulta particularmente adecuada para variables como la radiación solar, que presenta una fuerte dependencia temporal y un comportamiento periódico. En este contexto, se ha analizado el rendimiento de métodos univariantes de imputación bajo diferentes condiciones climáticas, concluyéndose que las estrategias basadas en estacionalidad ofrecen ventajas significativas en climas tropicales. De manera complementaria, se ha enfatizado que una adecuada imputación mejora sustancialmente la precisión de los modelos de predicción basados en aprendizaje automático, al reducir la incertidumbre asociada al preprocesamiento de datos meteorológicos.
Regla de Imputación Estacional
Supongamos que tienes una serie de datos:
- \(R_i\): valor de radiación en la observación \(i\)
- \(d_i\): día del año de la observación \(i\) (de 1 a 366)
- \(h_i\): hora del día de la observación \(i\) (de 0 a 23)
Para cada observación con valor faltante \(R_i = 0\), la imputación es:
\[ \hat{R}_i = \frac{1}{n_{d_i,h_i}} \sum_{j \in G(d_i,h_i)} R_j \]
Donde:
- \(\hat{R}_i\) es el valor imputado para la observación \(i\)
- \(G(d_i, h_i)\) es el conjunto de observaciones \(j\) donde \(R_j > 0\), y el día y la hora coinciden con \(d_i, h_i\)
- \(n_{d_i,h_i}\) es el número de valores disponibles en ese grupo
Si no existen datos en ese grupo:
\[ \hat{R}_i = 0 \]
# Seasonal-mean imputation: every zero reading is treated as missing and is
# replaced by the mean of the non-zero readings that share its
# (day-of-year, hour) slot; slots without any non-zero reading stay at 0.
df_INMET['datetime'] = pd.to_datetime(df_INMET['datetime'])
df_INMET['dayofyear'] = df_INMET['datetime'].dt.dayofyear
df_INMET['hour'] = df_INMET['datetime'].dt.hour

# mask() inserts float NaN, so the column keeps a numeric dtype
# (replacing with pd.NA can turn a float column into object dtype).
_observado = df_INMET['RADIACAO_GLOBAL'].mask(df_INMET['RADIACAO_GLOBAL'] == 0)
df_INMET['RADIACAO_GLOBAL_IMPUTADA'] = _observado

_grupos = df_INMET.groupby(['dayofyear', 'hour'])['RADIACAO_GLOBAL_IMPUTADA']
# Lookup table: mean non-zero radiation per (day-of-year, hour).
tabla_promedios = _grupos.mean()


def imputar_estacional(row):
    """Return the row's radiation, or its seasonal mean when it is missing.

    Row-wise equivalent of the vectorised fill below; it is no longer
    called here and is kept for any code that still applies it to a row.

    Args:
        row: Mapping with 'RADIACAO_GLOBAL_IMPUTADA', 'dayofyear' and 'hour'.

    Returns:
        The existing value when present; otherwise the entry of
        ``tabla_promedios`` for (dayofyear, hour), or 0 if that key is absent.
    """
    if pd.isna(row['RADIACAO_GLOBAL_IMPUTADA']):
        return tabla_promedios.get((int(row['dayofyear']), int(row['hour'])), 0)
    return row['RADIACAO_GLOBAL_IMPUTADA']


# Vectorised fill: transform('mean') broadcasts each slot's mean back onto
# its rows. Rows whose slot has no data (or whose timestamp is NaT) get NaN
# here and fall through to the final fillna(0).
df_INMET['RADIACAO_GLOBAL_IMPUTADA'] = (
    _observado.fillna(_grupos.transform('mean')).fillna(0)
)
print(df_INMET.shape)
df_INMET.tail(20)(54024, 5)
| datetime | RADIACAO_GLOBAL | RADIACAO_GLOBAL_IMPUTADA | dayofyear | hour | |
|---|---|---|---|---|---|
| 54004 | 2025-02-28 04:00:00 | 0.0 | 0.0 | 59 | 4 |
| 54005 | 2025-02-28 05:00:00 | 0.0 | 0.0 | 59 | 5 |
| 54006 | 2025-02-28 06:00:00 | 0.0 | 0.0 | 59 | 6 |
| 54007 | 2025-02-28 07:00:00 | 0.0 | 0.0 | 59 | 7 |
| 54008 | 2025-02-28 08:00:00 | 0.0 | 0.0 | 59 | 8 |
| 54009 | 2025-02-28 09:00:00 | 0.0 | 0.0 | 59 | 9 |
| 54010 | 2025-02-28 10:00:00 | 0.0 | 0.0 | 59 | 10 |
| 54011 | 2025-02-28 11:00:00 | 0.0 | 0.0 | 59 | 11 |
| 54012 | 2025-02-28 12:00:00 | 0.0 | 1749.0 | 59 | 12 |
| 54013 | 2025-02-28 13:00:00 | 0.0 | 0.0 | 59 | 13 |
| 54014 | 2025-02-28 14:00:00 | 0.0 | 2989.0 | 59 | 14 |
| 54015 | 2025-02-28 15:00:00 | 0.0 | 0.0 | 59 | 15 |
| 54016 | 2025-02-28 16:00:00 | 0.0 | 1930.0 | 59 | 16 |
| 54017 | 2025-02-28 17:00:00 | 0.0 | 0.0 | 59 | 17 |
| 54018 | 2025-02-28 18:00:00 | 0.0 | 3146.0 | 59 | 18 |
| 54019 | 2025-02-28 19:00:00 | 0.0 | 0.0 | 59 | 19 |
| 54020 | 2025-02-28 20:00:00 | 0.0 | 0.0 | 59 | 20 |
| 54021 | 2025-02-28 21:00:00 | 0.0 | 0.0 | 59 | 21 |
| 54022 | 2025-02-28 22:00:00 | 0.0 | 97.0 | 59 | 22 |
| 54023 | 2025-02-28 23:00:00 | 0.0 | 0.0 | 59 | 23 |
# Overlay the imputed series and the raw series on one interactive chart.
fig = go.Figure()
_series_a_trazar = (
    ('RADIACAO_GLOBAL_IMPUTADA', 'INMET-IMPUTADA', 'orange'),
    ('RADIACAO_GLOBAL', 'INMET-SIN IMPUTAR', 'blue'),
)
for _columna, _nombre, _color in _series_a_trazar:
    fig.add_trace(
        go.Scatter(
            x=df_INMET['datetime'],
            y=df_INMET[_columna],
            mode='lines',
            name=_nombre,
            line={'color': _color},
            opacity=0.7,
        )
    )
fig.update_layout(
    title='Evolución de Radiación INMET - Imputación Promedio Estacional (2019-2025)',
    xaxis_title='Fecha y Hora',
    yaxis_title='Radiación (KJ/m²)',
    legend_title='Fuente',
    template='plotly_white',
    height=500,
    width=1100,
)
fig.show()

# Strictly positive readings before and after imputation.
rad_orig = df_INMET.loc[df_INMET['RADIACAO_GLOBAL'] > 0, 'RADIACAO_GLOBAL']
rad_imp = df_INMET.loc[df_INMET['RADIACAO_GLOBAL_IMPUTADA'] > 0, 'RADIACAO_GLOBAL_IMPUTADA']

# Side-by-side histograms (shared y axis) of both positive samples.
fig, axes = plt.subplots(1, 2, figsize=(16, 5), sharey=True)
_paneles = (
    (axes[0], rad_orig, 'royalblue', 'RADIACAO_GLOBAL (> 0)', 'Frecuencia'),
    (axes[1], rad_imp, 'darkorange', 'RADIACAO_GLOBAL_IMPUTADA (> 0)', ''),
)
for _eje, _muestra, _tono, _titulo, _etiqueta_y in _paneles:
    sns.histplot(_muestra, bins=60, kde=True, color=_tono, ax=_eje)
    _eje.set_title(_titulo)
    _eje.set_xlabel('Radiación (KJ/m²)')
    _eje.set_ylabel(_etiqueta_y)
    _eje.grid(True)
fig.suptitle('Distribuciones de Radiación Solar INMET (Imputación Promedio Estacional) (> 0) - Antes y Después ', fontsize=16)
plt.tight_layout()
plt.show()
# Two-sample Kolmogorov-Smirnov test between the positive raw readings and
# the positive imputed readings.
_es_positivo_orig = df_INMET['RADIACAO_GLOBAL'] > 0
_es_positivo_imp = df_INMET['RADIACAO_GLOBAL_IMPUTADA'] > 0
rad_orig = df_INMET.loc[_es_positivo_orig, 'RADIACAO_GLOBAL']
rad_imp = df_INMET.loc[_es_positivo_imp, 'RADIACAO_GLOBAL_IMPUTADA']

_resultado_ks = ks_2samp(rad_orig, rad_imp)
stat_ks, p_ks = _resultado_ks[0], _resultado_ks[1]

print("Kolmogorov-Smirnov Test (Distribución Original vs Imputada):")
print(f"Estadístico KS: {stat_ks:.4f}")
print(f"p-valor: {p_ks:.4e}")
# H0 (same distribution) is kept only when the p-value exceeds 0.05.
_conclusion = (
    "No se rechaza H0 → Las distribuciones son estadísticamente similares."
    if p_ks > 0.05
    else "Se rechaza H0 → Las distribuciones son diferentes."
)
print(_conclusion)
Kolmogorov-Smirnov Test (Distribución Original vs Imputada):
Estadístico KS: 0.0153
p-valor: 8.3915e-01
No se rechaza H0 → Las distribuciones son estadísticamente similares.
3.0.0.2 EDA
En el contexto de la predicción de radiación solar mediante modelos de series de tiempo, es fundamental conservar la estructura completa de la serie, incluyendo los valores de radiación nocturna iguales a cero. El mantenimiento de esta continuidad temporal asegura que los algoritmos puedan captar correctamente la dinámica estacional y los ciclos diarios, evitando la introducción de sesgos por omisión de periodos sin registros. Este enfoque es especialmente relevante al trabajar con modelos como redes neuronales recurrentes, regresión de soporte vectorial o procesos gaussianos, los cuales requieren secuencias homogéneas y completas para una convergencia adecuada y una generalización efectiva.
Adicionalmente, investigaciones que comparan arquitecturas híbridas de redes neuronales han resaltado que la inclusión de todos los datos horarios, incluyendo los correspondientes a la noche, contribuye a un entrenamiento más robusto y coherente, permitiendo una mejor adaptación a diferentes contextos geográficos y meteorológicos.
3.0.0.2.1 Horas
# Persist the table (raw and imputed columns) before the index is changed.
df_INMET.to_csv('df_INMET.csv', index=False)

# Interactive view of the imputed hourly series.
df_INMET['datetime'] = pd.to_datetime(df_INMET['datetime'])
fig = px.line(
    df_INMET,
    x='datetime',
    y='RADIACAO_GLOBAL_IMPUTADA',
    title='Time Series - Global Radiation',
    labels={'datetime': 'Date', 'RADIACAO_GLOBAL_IMPUTADA': 'Radiation'},
)
fig.update_layout(
    xaxis_title='Fecha',
    yaxis_title='Radiación',
    template='plotly_white',
    width=1000,
    height=500,
)
fig.show()

# From here on the frame is indexed by timestamp ('datetime' was parsed above).
df_INMET = df_INMET.set_index('datetime')

# Static version of the same series, saved under figures/.
os.makedirs("figures", exist_ok=True)
plt.figure(figsize=(14, 5))
plt.plot(df_INMET['RADIACAO_GLOBAL_IMPUTADA'], color='blue')
plt.title('Time Series - Global Radiation', fontsize=18)
plt.xlabel('Date', fontsize=14)
# The plotted column is the KJ/m² radiation column, so label it as such.
plt.ylabel('Radiation (KJ/m²)', fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig("figures/Global_Radiation_TimeSeries.png", dpi=300)
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import os

año_inicio = 2019

# Keep the timestamps parsed from the files instead of overwriting them with
# a synthetic range: if any hour were missing or unparseable, a synthetic
# range would silently shift every later observation to the wrong time.
if 'datetime' in df_INMET.columns:
    df_INMET = df_INMET.set_index('datetime')
# Label-based slicing below needs a sorted index without NaT entries.
df_INMET = df_INMET[df_INMET.index.notna()].sort_index()

# Gap-free hourly calendar the data is expected to follow; used only as a check.
fechas_completas = pd.date_range(start=f"{año_inicio}-01-01 00:00:00", periods=len(df_INMET), freq='H')
if not df_INMET.index.equals(fechas_completas):
    print(
        "Aviso: el índice temporal no coincide con una rejilla horaria completa "
        f"desde {año_inicio}-01-01; se conservan las marcas de tiempo reales."
    )

# Full series on the main axes.
fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(df_INMET.index, df_INMET['RADIACAO_GLOBAL_IMPUTADA'], color='blue')
ax.set_title('Time Series - Global Radiation', fontsize=18)
ax.set_xlabel('Date', fontsize=14)
# The plotted column is the KJ/m² radiation column, so label it as such.
ax.set_ylabel('Radiation (KJ/m²)', fontsize=14)
ax.grid(True)

# Inset showing a five-day window of the same series.
interval_start = pd.Timestamp("2020-01-10 00:00:00")
interval_end = pd.Timestamp("2020-01-15 00:00:00")
zoom_data = df_INMET.loc[interval_start:interval_end]
ax_inset = fig.add_axes([0.6, 0.45, 0.25, 0.4])  # [left, bottom, width, height]
ax_inset.plot(zoom_data.index, zoom_data['RADIACAO_GLOBAL_IMPUTADA'], color='red')
ax_inset.set_title('Zoom: Jan 10–15, 2020', fontsize=10)
ax_inset.grid(True)
ax_inset.set_xticks([])
ax_inset.set_yticks([])
ax_inset.set_xlabel('')
ax_inset.set_ylabel('')

os.makedirs("figures", exist_ok=True)
plt.tight_layout()
plt.savefig("figures/Global_Radiation_TimeSeries_Zoom.png", dpi=300)
plt.show()
def _resumen_estadistico(serie):
    """Return the descriptive statistics of one series as an ordered dict."""
    return {
        'N_records': len(serie),
        'μ': serie.mean(),
        'σ': serie.std(),
        'y_min': serie.min(),
        'Q1': serie.quantile(0.25),
        'x̄': serie.median(),
        'Q3': serie.quantile(0.75),
        'y_max': serie.max(),
        'Kurtosis': serie.kurtosis(),
        'Skewness': serie.skew(),
    }


# One column of statistics per analysed variable (hourly resolution).
variables = ['RADIACAO_GLOBAL_IMPUTADA']
resumen = {nombre: _resumen_estadistico(df_INMET[nombre]) for nombre in variables}
df_resumen = pd.DataFrame(resumen)
df_resumen| RADIACAO_GLOBAL_IMPUTADA | |
|---|---|
| N_records | 54024.000000 |
| μ | 284.327790 |
| σ | 752.818778 |
| y_min | 0.000000 |
| Q1 | 0.000000 |
| x̄ | 0.000000 |
| Q3 | 0.000000 |
| y_max | 3924.000000 |
| Kurtosis | 6.500850 |
| Skewness | 2.742065 |
from statsmodels.tsa.seasonal import seasonal_decompose

# Additive decomposition of the last 1000 hourly values with a 24-step period.
_ultimas_1000_horas = df_INMET['RADIACAO_GLOBAL_IMPUTADA'].tail(1000)
descomposicion = seasonal_decompose(
    _ultimas_1000_horas,
    model='additive',
    period=24,
)
descomposicion.plot()
plt.suptitle("Decomposition of Global Radiation (Daily Cycle)", fontsize=16)
plt.tight_layout()
plt.show()
# describe() statistics of the imputed radiation, one row per value of 'hour'.
desc_por_hora = df_INMET.groupby('hour')['RADIACAO_GLOBAL_IMPUTADA'].describe()
desc_por_hora| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| hour | ||||||||
| 0 | 2251.0 | 0.021324 | 0.291339 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 |
| 1 | 2251.0 | 0.024434 | 0.287252 | 0.0 | 0.0 | 0.0 | 0.0 | 5.0 |
| 2 | 2251.0 | 0.009329 | 0.167072 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 |
| 3 | 2251.0 | 0.032430 | 0.292600 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 |
| 4 | 2251.0 | 0.015549 | 0.184338 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 |
| 5 | 2251.0 | 0.011550 | 0.132832 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 |
| 6 | 2251.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 7 | 2251.0 | 0.005775 | 0.075792 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 8 | 2251.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 9 | 2251.0 | 0.822301 | 4.227157 | 0.0 | 0.0 | 0.0 | 0.0 | 51.0 |
| 10 | 2251.0 | 32.930920 | 96.786907 | 0.0 | 0.0 | 0.0 | 0.0 | 666.0 |
| 11 | 2251.0 | 185.690582 | 346.654214 | 0.0 | 0.0 | 0.0 | 262.0 | 1656.0 |
| 12 | 2251.0 | 443.797201 | 687.757224 | 0.0 | 0.0 | 0.0 | 913.0 | 2524.0 |
| 13 | 2251.0 | 593.693470 | 917.315057 | 0.0 | 0.0 | 0.0 | 1498.0 | 3033.0 |
| 14 | 2251.0 | 901.893159 | 1242.582789 | 0.0 | 0.0 | 0.0 | 2175.0 | 3425.0 |
| 15 | 2251.0 | 935.042870 | 1337.877727 | 0.0 | 0.0 | 0.0 | 2449.0 | 3793.0 |
| 16 | 2251.0 | 1129.770546 | 1418.127255 | 0.0 | 0.0 | 0.0 | 2611.0 | 3924.0 |
| 17 | 2251.0 | 834.166148 | 1219.894113 | 0.0 | 0.0 | 0.0 | 2212.0 | 3782.0 |
| 18 | 2251.0 | 775.424256 | 1051.873855 | 0.0 | 0.0 | 0.0 | 1827.0 | 3568.0 |
| 19 | 2251.0 | 499.856952 | 767.244822 | 0.0 | 0.0 | 0.0 | 1114.0 | 2760.0 |
| 20 | 2251.0 | 344.708796 | 527.554926 | 0.0 | 0.0 | 0.0 | 737.0 | 1999.0 |
| 21 | 2251.0 | 127.717015 | 245.809561 | 0.0 | 0.0 | 0.0 | 185.0 | 1276.0 |
| 22 | 2251.0 | 18.125722 | 60.460133 | 0.0 | 0.0 | 0.0 | 1.0 | 639.0 |
| 23 | 2251.0 | 0.106619 | 0.705033 | 0.0 | 0.0 | 0.0 | 0.0 | 9.0 |
# Create the output folder before the first savefig of this cell; it was
# previously created only after the decomposition figure had been written.
os.makedirs('figures', exist_ok=True)

# Additive decomposition of the last 1000 hourly values with a 24-step period.
descomposicion = seasonal_decompose(df_INMET['RADIACAO_GLOBAL_IMPUTADA'].tail(1000), model='additive', period=24)
fig = descomposicion.plot()
fig.suptitle("Decomposition of Global Radiation (Daily Cycle)", fontsize=16)
plt.tight_layout()
fig.savefig('figures/decomposition_global_radiation.png')
plt.show()

# Mean imputed radiation for each value of 'hour'.
os.makedirs('figures', exist_ok=True)
media_por_hora = df_INMET.groupby('hour')['RADIACAO_GLOBAL_IMPUTADA'].mean().reset_index()
plt.figure(figsize=(8, 6))
sns.lineplot(data=media_por_hora, x='hour', y='RADIACAO_GLOBAL_IMPUTADA', marker='o')
plt.title('Average Global Radiation by Hour', fontsize=16)
plt.xlabel('Hour of Day', fontsize=12)
plt.ylabel('Global Radiation', fontsize=12)
plt.xticks(range(0, 24))
plt.grid(True)
plt.savefig('figures/avg_radiation_by_hour.png')
plt.show()
# Interactive box plot of the imputed radiation for each hour of the day.
os.makedirs('figures', exist_ok=True)
if 'hour' not in df_INMET.columns:
    df_INMET['hour'] = df_INMET.index.hour

_etiquetas_hora = {
    'hour': 'Hour of Day',
    'RADIACAO_GLOBAL_IMPUTADA': 'Global Radiation',
}
fig = px.box(
    data_frame=df_INMET,
    x='hour',
    y='RADIACAO_GLOBAL_IMPUTADA',
    title='Distribution of Global Radiation by Hour of the Day',
    labels=_etiquetas_hora,
)
_tamanos_fuente = {
    'title_font_size': 18,
    'xaxis_title_font_size': 14,
    'yaxis_title_font_size': 14,
}
fig.update_layout(**_tamanos_fuente)
fig.write_html("figures/boxplot_radiation_by_hour_plotly.html")
fig.show()

# Static seaborn version of the same box plot, saved as PNG.
os.makedirs('figures', exist_ok=True)
if 'hour' not in df_INMET.columns:
    df_INMET['hour'] = df_INMET.index.hour

plt.figure(figsize=(10, 6))
sns.boxplot(x='hour', y='RADIACAO_GLOBAL_IMPUTADA', data=df_INMET)
plt.title('Distribution of Global Radiation by Hour of the Day', fontsize=18)
plt.xlabel('Hour of Day', fontsize=14)
plt.ylabel('Global Radiation', fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('figures/boxplot_radiation_by_hour.png', dpi=300)
plt.show()
# Autocorrelation of the last 200 hourly values.
os.makedirs('figures', exist_ok=True)
plt.figure(figsize=(5, 4))
autocorrelation_plot(df_INMET['RADIACAO_GLOBAL_IMPUTADA'].tail(200))
plt.title("Autocorrelation - Global Radiation", fontsize=14)
plt.grid(True)
plt.savefig("figures/autocorrelation_global_radiation.png")
plt.show()

from statsmodels.graphics.tsaplots import plot_pacf

# Partial autocorrelation of the last 2000 hourly values.
# plot_pacf opens its own figure unless it is given an axes; passing ax
# makes the requested figsize apply and avoids a stray empty figure.
os.makedirs("figures", exist_ok=True)
fig, ax = plt.subplots(figsize=(5, 4))
plot_pacf(df_INMET['RADIACAO_GLOBAL_IMPUTADA'].tail(2000), lags=100, method='ywm', ax=ax)
ax.set_title("PACF - Global Radiation (Hourly Series)", fontsize=12)
ax.grid(True)
fig.tight_layout()
fig.savefig("figures/pacf_hourly_compact.png")
plt.show()<Figure size 500x400 with 0 Axes>

3.0.0.3 EDA DIARIA
def _resumen_estadistico(serie):
    """Return the descriptive statistics of one series as an ordered dict."""
    return {
        'N_records': len(serie),
        'μ': serie.mean(),
        'σ': serie.std(),
        'y_min': serie.min(),
        'Q1': serie.quantile(0.25),
        'x̄': serie.median(),
        'Q3': serie.quantile(0.75),
        'y_max': serie.max(),
        'Kurtosis': serie.kurtosis(),
        'Skewness': serie.skew(),
    }


# Daily mean of the imputed hourly radiation.
df_diario = df_INMET['RADIACAO_GLOBAL_IMPUTADA'].resample('D').mean()

# Every listed variable is summarised from the same daily series.
variables = ['RADIACAO_GLOBAL_IMPUTADA']
resumen = {nombre: _resumen_estadistico(df_diario) for nombre in variables}
df_resumen = pd.DataFrame(resumen)
df_resumen| RADIACAO_GLOBAL_IMPUTADA | |
|---|---|
| N_records | 2251.000000 |
| μ | 284.327790 |
| σ | 142.219504 |
| y_min | 0.000000 |
| Q1 | 187.916667 |
| x̄ | 263.104167 |
| Q3 | 369.541667 |
| y_max | 799.916667 |
| Kurtosis | 0.287621 |
| Skewness | 0.546791 |
# Daily series with the zoomed interval shaded and repeated in an inset.
os.makedirs("figures", exist_ok=True)
interval_start = "2022-01-01"
interval_end = "2022-03-01"
_inicio_zoom = pd.to_datetime(interval_start)
_fin_zoom = pd.to_datetime(interval_end)

fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(df_diario, color='blue')
ax.set_title('Daily Time Series - Global Radiation', fontsize=18)
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Global Radiation', fontsize=12)
ax.grid(True)
ax.axvspan(_inicio_zoom, _fin_zoom, color='gray', alpha=0.3, label='Zoom region')

_posicion_inset = [0.55, 0.50, 0.3, 0.35]  # [x, y, width, height]
ax_inset = fig.add_axes(_posicion_inset)
zoom_data = df_diario.loc[interval_start:interval_end]
ax_inset.plot(zoom_data, color='red')
ax_inset.set_title('Zoom: Jan–Mar 2022', fontsize=10)
for _fijar_marcas in (ax_inset.set_xticks, ax_inset.set_yticks):
    _fijar_marcas([])

plt.tight_layout()
plt.savefig("figures/daily_series_with_zoom_highlighted.png")
plt.show()

# Additive decomposition of the last 200 daily values with a 7-step period.
_ultimos_200_dias = df_diario.tail(200)
descomposicion = seasonal_decompose(_ultimos_200_dias, model='additive', period=7)
descomposicion.plot()
plt.suptitle('Descomposición de Radiación Global (ciclo semanal - 7 días)', fontsize=16)
plt.tight_layout()
plt.show()
# describe() statistics of the hourly imputed radiation per weekday,
# with rows ordered Monday to Sunday.
df_INMET['day_of_week'] = df_INMET.index.day_name()
orden_dias = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
desc_por_dia = (
    df_INMET
    .groupby('day_of_week')['RADIACAO_GLOBAL_IMPUTADA']
    .describe()
    .loc[orden_dias]
)
desc_por_dia
desc_por_dia| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| day_of_week | ||||||||
| Monday | 7704.0 | 276.915390 | 740.072654 | 0.0 | 0.0 | 0.0 | 0.0 | 3924.0 |
| Tuesday | 7728.0 | 288.041990 | 760.135891 | 0.0 | 0.0 | 0.0 | 0.0 | 3924.0 |
| Wednesday | 7728.0 | 281.359645 | 747.620231 | 0.0 | 0.0 | 0.0 | 0.0 | 3924.0 |
| Thursday | 7728.0 | 284.264644 | 751.770750 | 0.0 | 0.0 | 0.0 | 0.0 | 3869.0 |
| Friday | 7728.0 | 285.984170 | 756.346878 | 0.0 | 0.0 | 0.0 | 0.0 | 3924.0 |
| Saturday | 7704.0 | 285.550299 | 753.603337 | 0.0 | 0.0 | 0.0 | 0.0 | 3924.0 |
| Sunday | 7704.0 | 288.171102 | 760.189282 | 0.0 | 0.0 | 0.0 | 0.0 | 3924.0 |
# Mean imputed radiation per weekday, plotted Monday to Sunday.
os.makedirs("figures", exist_ok=True)
_dias_semana = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
_media_por_dia = (
    df_INMET
    .groupby(df_INMET.index.day_name())['RADIACAO_GLOBAL_IMPUTADA']
    .mean()
    .reindex(_dias_semana)
)
average_by_day = (
    _media_por_dia
    .rename_axis('Day')
    .rename('Average Radiation')
    .reset_index()
)

plt.figure(figsize=(8, 6))
sns.lineplot(x='Day', y='Average Radiation', data=average_by_day, marker='o')
plt.title('Average Global Radiation by Day of the Week', fontsize=18)
plt.xlabel('Day of the Week', fontsize=14)
plt.ylabel('Global Radiation', fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig("figures/Average_Radiation_by_Day.png", dpi=300)
plt.show()
# Daily means (days without data dropped), tagged with their weekday name.
os.makedirs("figures", exist_ok=True)
daily_series = df_INMET['RADIACAO_GLOBAL_IMPUTADA'].resample('D').mean().dropna()
df_daily = pd.DataFrame({'RADIACAO_GLOBAL_IMPUTADA': daily_series})
df_daily['Day'] = df_daily.index.day_name()
days_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Interactive box plot per weekday, exported as HTML.
_etiquetas_dia = {
    'Day': 'Day of the Week',
    'RADIACAO_GLOBAL_IMPUTADA': 'Global Radiation (Imputed)',
}
fig = px.box(
    data_frame=df_daily,
    x='Day',
    y='RADIACAO_GLOBAL_IMPUTADA',
    labels=_etiquetas_dia,
    category_orders={'Day': days_order},
    title='Distribution of Imputed Global Radiation by Day of the Week',
)
_tamanos_fuente = {
    'title_font_size': 18,
    'xaxis_title_font_size': 14,
    'yaxis_title_font_size': 14,
}
fig.update_layout(**_tamanos_fuente)
fig.write_html("figures/Boxplot_GlobalRadiation_ByDay_Plotly.html")
fig.show()

# Static seaborn version of the same box plot, saved as PNG.
os.makedirs("figures", exist_ok=True)
plt.figure(figsize=(10, 6))
sns.boxplot(
    x='Day',
    y='RADIACAO_GLOBAL_IMPUTADA',
    data=df_daily,
    order=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
)
plt.title('Distribution of Imputed Global Radiation by Day of the Week', fontsize=18)
plt.xlabel('Day of the Week', fontsize=14)
plt.ylabel('Global Radiation (Imputed)', fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig("figures/Boxplot_GlobalRadiation_ByDay.png", dpi=300)
plt.show()
# Autocorrelation of the last 200 daily means.
plt.figure(figsize=(6, 4))
autocorrelation_plot(daily_series.tail(200))
plt.title("Autocorrelation - Global Radiation", fontsize=12)
plt.grid(True)
plt.tight_layout()
plt.savefig("figures/Autocorrelation_GlobalRadiation_Daily_200days.png", dpi=300)
plt.show()

# Partial autocorrelation of (at most) the last 2000 daily means.
# plot_pacf opens its own figure unless it is given an axes; passing ax
# makes the requested figsize apply and avoids a stray empty figure.
fig, ax = plt.subplots(figsize=(10, 4))
plot_pacf(daily_series.tail(2000), lags=150, method='ywm', ax=ax)
ax.set_title("PACF - Global Radiation (Daily Series)", fontsize=14)
ax.grid(True)
fig.tight_layout()
fig.savefig("figures/PACF_GlobalRadiation_DailySeries.png", dpi=300)
plt.show()<Figure size 1000x400 with 0 Axes>

3.0.0.3.1 EDA MENSUAL
def _resumen_estadistico(serie):
    """Return the descriptive statistics of one series as an ordered dict."""
    return {
        'N_records': len(serie),
        'μ': serie.mean(),
        'σ': serie.std(),
        'y_min': serie.min(),
        'Q1': serie.quantile(0.25),
        'x̄': serie.median(),
        'Q3': serie.quantile(0.75),
        'y_max': serie.max(),
        'Kurtosis': serie.kurtosis(),
        'Skewness': serie.skew(),
    }


# Monthly mean of the imputed hourly radiation, as a one-column frame.
_media_mensual = df_INMET['RADIACAO_GLOBAL_IMPUTADA'].resample('M').mean()
df_mensual = _media_mensual.to_frame(name='Promedio_Radiacion_Mensual')

variables = ['Promedio_Radiacion_Mensual']
resumen = {nombre: _resumen_estadistico(df_mensual[nombre]) for nombre in variables}
df_resumen = pd.DataFrame(resumen)
df_resumen| Promedio_Radiacion_Mensual | |
|---|---|
| N_records | 74.000000 |
| μ | 284.331526 |
| σ | 49.663561 |
| y_min | 225.783565 |
| Q1 | 249.974238 |
| x̄ | 274.986044 |
| Q3 | 313.849497 |
| y_max | 411.725806 |
| Kurtosis | 1.182565 |
| Skewness | 1.264723 |
# Monthly mean series.
plt.figure(figsize=(14, 5))
plt.plot(df_mensual, color='blue')
plt.title('Monthly Time Series - Global Radiation', fontsize=18)
plt.xlabel('Date', fontsize=18)
# The series is derived from the KJ/m² radiation column, so label it as such.
plt.ylabel('Radiation (KJ/m²)', fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.grid(True)
plt.tight_layout()
plt.savefig("figures/Monthly_GlobalRadiation_TimeSeries.png", dpi=300)
plt.show()

# Additive decomposition of the monthly means with a 12-step period.
descomposicion = seasonal_decompose(df_mensual, model='additive', period=12)
descomposicion.plot()
plt.suptitle('Decomposition of Global Radiation (Monthly Cycle)', fontsize=16)
plt.tight_layout()
plt.show()
# Daily means tagged with their calendar month (1-12).
df_diario = df_INMET[['RADIACAO_GLOBAL_IMPUTADA']].resample('D').mean()
df_diario['Mes'] = df_diario.index.month
agrupado_mes_calendario = df_diario.groupby('Mes')['RADIACAO_GLOBAL_IMPUTADA']

# One row per calendar month, one column per statistic of the daily means.
_estadisticos_por_mes = {
    'N_records': agrupado_mes_calendario.count(),
    'μ': agrupado_mes_calendario.mean(),
    'σ': agrupado_mes_calendario.std(),
    'y_min': agrupado_mes_calendario.min(),
    'Q1': agrupado_mes_calendario.quantile(0.25),
    'x̄': agrupado_mes_calendario.median(),
    'Q3': agrupado_mes_calendario.quantile(0.75),
    'y_max': agrupado_mes_calendario.max(),
    'Kurtosis': agrupado_mes_calendario.apply(pd.Series.kurtosis),
    'Skewness': agrupado_mes_calendario.apply(pd.Series.skew),
}
resumen_mensual_simple = pd.DataFrame(_estadisticos_por_mes)

# Swap the month numbers in the index for month names.
_meses_como_fecha = pd.to_datetime(resumen_mensual_simple.index, format='%m')
resumen_mensual_simple.index = _meses_como_fecha.month_name()
resumen_mensual_simple| N_records | μ | σ | y_min | Q1 | x̄ | Q3 | y_max | Kurtosis | Skewness | |
|---|---|---|---|---|---|---|---|---|---|---|
| Mes | ||||||||||
| January | 217 | 407.477151 | 163.626147 | 68.500000 | 283.104167 | 383.895833 | 516.208333 | 799.916667 | -0.371655 | 0.164949 |
| February | 198 | 317.083649 | 133.032529 | 53.875000 | 235.833333 | 304.166667 | 392.291667 | 659.645833 | 0.033322 | 0.372261 |
| March | 186 | 234.346550 | 127.571404 | 3.875000 | 154.541667 | 225.500000 | 298.250000 | 610.666667 | 1.327689 | 0.917040 |
| April | 180 | 237.857407 | 141.355477 | 0.583333 | 140.453125 | 232.500000 | 306.776042 | 596.875000 | 0.105675 | 0.537886 |
| May | 186 | 257.869848 | 113.049078 | 69.791667 | 160.177083 | 244.722222 | 350.583333 | 518.916667 | -0.768204 | 0.174336 |
| June | 180 | 230.266204 | 111.890502 | 0.000000 | 147.541667 | 226.770833 | 324.916667 | 430.125000 | -0.636302 | -0.359935 |
| July | 186 | 252.023746 | 137.279431 | 27.916667 | 168.791667 | 217.583333 | 357.541667 | 630.291667 | 0.256444 | 0.710706 |
| August | 186 | 277.110215 | 101.562718 | 61.750000 | 210.166667 | 274.187500 | 344.500000 | 494.375000 | -0.488741 | -0.054911 |
| September | 180 | 274.390278 | 129.997583 | 40.041667 | 181.416667 | 258.791667 | 339.291667 | 579.208333 | -0.158738 | 0.444753 |
| October | 186 | 321.683580 | 149.988278 | 60.333333 | 185.083333 | 332.604167 | 430.125000 | 668.291667 | -0.582618 | 0.048882 |
| November | 180 | 291.265625 | 137.199636 | 32.208333 | 196.562500 | 267.375000 | 364.041667 | 687.604167 | 0.801704 | 0.766444 |
| December | 186 | 284.581317 | 139.366372 | 28.833333 | 194.666667 | 253.104167 | 382.250000 | 743.041667 | 0.707201 | 0.739721 |
# Mean imputed radiation per calendar month, ordered January to December.
orden_meses = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']
promedio_por_mes = (
    df_INMET
    .groupby(df_INMET.index.month_name())['RADIACAO_GLOBAL_IMPUTADA']
    .mean()
    .loc[orden_meses]
    .rename_axis('Month')
    .rename('Average Radiation')
    .reset_index()
)

plt.figure(figsize=(12, 6))
sns.lineplot(x='Month', y='Average Radiation', data=promedio_por_mes, marker='o')
plt.title('Average Global Radiation by Month of the Year', fontsize=18)
plt.xlabel('Month of the Year', fontsize=14)
plt.ylabel('Global Radiation (Imputed)', fontsize=14)
plt.xticks(fontsize=13)
plt.yticks(fontsize=13)
plt.grid(True)
plt.tight_layout()
plt.savefig("figures/avg_radiation_by_month.png", dpi=300)
plt.show()

# Tag every hourly row with its month name for the violin plot.
df_INMET['Month'] = df_INMET.index.month_name()
month_order = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']

os.makedirs("figures", exist_ok=True)
plt.figure(figsize=(14, 6))
sns.violinplot(
    x='Month',
    y='RADIACAO_GLOBAL_IMPUTADA',
    data=df_INMET,
    inner='box',
    order=month_order,
)
plt.title('Distribution of Global Radiation by Month', fontsize=18)
plt.xlabel('Month', fontsize=16)
plt.ylabel('Global Radiation', fontsize=16)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig("figures/violinplot_radiation_by_month.png", dpi=300)
plt.show()
# Interactive violin plot of the imputed radiation per month.
df_INMET['Month'] = pd.Categorical(df_INMET['Month'], categories=month_order, ordered=True)
fig = px.violin(
    df_INMET,
    x='Month',
    y='RADIACAO_GLOBAL_IMPUTADA',
    category_orders={'Month': month_order},
    box=True,
    points=False,
    title='Distribution of Imputed Global Radiation by Month',
)
fig.update_layout(
    title_font_size=18,
    xaxis_title='Month',
    yaxis_title='Global Radiation (Imputed)',
    xaxis_tickangle=45,
    font=dict(size=12),
)
fig.show()

# Autocorrelation of the monthly mean series.
serie_mensual = df_INMET['RADIACAO_GLOBAL_IMPUTADA'].resample('M').mean()
plt.figure(figsize=(5, 4))
autocorrelation_plot(serie_mensual)
plt.title("Autocorrelation - Global Radiation (Monthly Series)", fontsize=12)
plt.grid(True)
plt.tight_layout()
plt.savefig("figures/autocorrelation_monthly_series.png", dpi=300)
plt.show()

# Partial autocorrelation of the monthly mean series.
# plot_pacf opens its own figure unless it is given an axes; passing ax
# makes the requested figsize apply and avoids a stray empty figure.
fig, ax = plt.subplots(figsize=(5, 4))
plot_pacf(serie_mensual, lags=30, method='ywm', ax=ax)
ax.set_title("PACF - Global Radiation (Monthly Series)", fontsize=12)
ax.grid(True)
fig.tight_layout()
fig.savefig("figures/pacf_monthly_series.png", dpi=300)
plt.show()<Figure size 500x400 with 0 Axes>
